1. Male and female graduate enrollment (1967 - 2015). Compare trends in total graduate enrollment for males and females (including full-time/part-time and private/public universities) in the United States from 1967 - 2015. Describe your results statistically, graphically and in text.
What if we want to describe the way that one variable changes with respect to another variable? Then we might use regression to:
Understand how well input (x) variable(s) predict the value of an outcome (y) variable
How do input variables influence the output value?
Find a regression equation that MAY be used to make predictions for currently unknown values
4 assumptions for linear regression
Linearity
Independence
Homoscedasticity (residuals variance)
Normality
Linear Regression Steps:
Look at and think about your data: Does a linear relationship make the most sense, conceptually & visually?
Run the regression: lm(y ~ x, data = df)
Check the diagnostic plots: plot(model_name) – for evidence of homoscedasticity and normal residuals
Communicate the results in text/table/figure
# Read in dataset (annual US graduate enrollment totals, 1967-2015; one row per year)
enroll_totals <- read_csv("Grad enrollment 1967 - 2015.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## total = col_integer(),
## total_males = col_integer(),
## total_females = col_integer()
## )
#### Test for linearity ####
# Quick scatterplots to visually check the linearity assumption before regressing
plot(enroll_totals$year, enroll_totals$total_males)
# male enrollment increases at a linear trend by year.
plot(enroll_totals$year, enroll_totals$total_females)
# female enrollment increases at a linear trend by year
#### Run regressions ####
# linear regression for males predicted by year
males_lm <- lm(total_males ~ year, data = enroll_totals)
summary(males_lm)
##
## Call:
## lm(formula = total_males ~ year, data = enroll_totals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96461 -34861 -12841 51876 95766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17112153 1087024 -15.74 <2e-16 ***
## year 9069 546 16.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54050 on 47 degrees of freedom
## Multiple R-squared: 0.8545, Adjusted R-squared: 0.8514
## F-statistic: 276 on 1 and 47 DF, p-value: < 2.2e-16
# fitted model: total_male_enrollment = -17112153 + 9069*year
# check diagnostic plots (2x2 panel: residuals vs fitted, Q-Q, scale-location, leverage)
# NOTE: par(mfrow) changes the global graphics state for subsequent base plots
par(mfrow = c(2,2))
plot(males_lm)
# residuals are normally distributed
# linear regression for females predicted by year
females_lm <- lm(total_females ~ year, data = enroll_totals)
summary(females_lm)
##
## Call:
## lm(formula = total_females ~ year, data = enroll_totals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89397 -48101 -7633 45267 129727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.896e+07 1.161e+06 -50.77 <2e-16 ***
## year 3.013e+04 5.832e+02 51.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57730 on 47 degrees of freedom
## Multiple R-squared: 0.9827, Adjusted R-squared: 0.9823
## F-statistic: 2669 on 1 and 47 DF, p-value: < 2.2e-16
females_lm
##
## Call:
## lm(formula = total_females ~ year, data = enroll_totals)
##
## Coefficients:
## (Intercept) year
## -58955502 30126
# fitted model: total_female_enrollment = -5.896e+07 + 3.013e+04*year
# Both intercepts imply negative enrollment at year 0, which is impossible —
# a reminder that neither model should be extrapolated far outside 1967-2015.
# check diagnostic plots
par(mfrow = c(2,2))
plot(females_lm)
# residuals are normally distributed
# correlation testing for males vs year (Pearson's r quantifies strength of the linear trend)
cor.test(enroll_totals$year, enroll_totals$total_males)
##
## Pearson's product-moment correlation
##
## data: enroll_totals$year and enroll_totals$total_males
## t = 16.612, df = 47, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8690777 0.9568547
## sample estimates:
## cor
## 0.9243741
# strong positive correlation
# correlation testing for females vs year
cor.test(enroll_totals$year, enroll_totals$total_females)
##
## Pearson's product-moment correlation
##
## data: enroll_totals$year and enroll_totals$total_females
## t = 51.659, df = 47, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9845609 0.9951144
## sample estimates:
## cor
## 0.9913086
#even stronger positive correlation
# Need to interpret.
# Side-by-side html regression table for both models.
# NOTE(review): the output file name "AIC.htm" is misleading for a regression
# table (no AIC reported here) — consider renaming.
m_f_enroll_lmtbl <- stargazer(males_lm, females_lm, type = "html", out = "AIC.htm")
| Dependent variable: | ||
| total_males | total_females | |
| (1) | (2) | |
| year | 9,069.301*** | 30,126.030*** |
| (545.955) | (583.175) | |
| Constant | -17,112,153.000*** | -58,955,502.000*** |
| (1,087,024.000) | (1,161,132.000) | |
| Observations | 49 | 49 |
| R2 | 0.854 | 0.983 |
| Adjusted R2 | 0.851 | 0.982 |
| Residual Std. Error (df = 47) | 54,046.810 | 57,731.420 |
| F Statistic (df = 1; 47) | 275.952*** | 2,668.611*** |
| Note: | p<0.1; p<0.05; p<0.01 | |
# FINAL GRAPH: Make a time series graph with year on x axis and totals on y axis
# The original mapped the literal strings "blue"/"red" in aes() and relied on
# their alphabetical level order lining up with labels = c("Males", "Females");
# mapping descriptive series names with a named color vector removes that
# fragile coupling while producing the same plot.
totals_line <- ggplot(enroll_totals, aes(x = year)) +
  geom_line(aes(y = total_males, color = "Males")) +
  geom_line(aes(y = total_females, color = "Females")) +
  theme_classic() +
  scale_x_continuous(expand = c(0,0), breaks = seq(1967, 2015, by = 5), limits = c(1967, 2015)) +
  scale_y_continuous(breaks = seq(0, 2200000, by = 200000)) +
  # named values pin each series to its color; breaks keep Males first in the legend
  scale_color_manual(values = c("Males" = "#00AFBB", "Females" = "#E7B800"),
                     breaks = c("Males", "Females")) +
  labs(x = "Year", y = "Enrollment Totals") +
  theme(text = element_text(family = "Century Schoolbook"), legend.title=element_blank())
totals_line
# FINAL GRAPH: plot the regressions with geom_smooth
# Two fixes relative to the original:
# 1. color aesthetics use descriptive names with a named value vector instead
#    of literal "blue"/"red" strings that only matched the labels by
#    alphabetical coincidence;
# 2. the R^2 annotations use plotmath strings with parse = TRUE — passing
#    expression() objects to annotate() triggered the
#    "is.na() applied to non-(list or vector) of type 'expression'" warnings.
regression_graphs <- ggplot(enroll_totals, aes(x = year)) +
  geom_point(aes(y = total_males, color = "Males")) +
  geom_smooth(aes(y = total_males, color = "Males"), method = "lm", se = TRUE) +
  geom_point(aes(y = total_females, color = "Females")) +
  geom_smooth(aes(y = total_females, color = "Females"), method = "lm", se = TRUE) +
  theme_classic() +
  labs(x = "Year", y = "Enrollment Totals") +
  scale_x_continuous(expand = c(0,0), breaks = seq(1967, 2015, by = 5), limits = c(1967, 2015)) +
  scale_y_continuous(breaks = seq(0, 2200000, by = 200000)) +
  scale_color_manual(values = c("Males" = "skyblue1", "Females" = "salmon"),
                     breaks = c("Males", "Females")) +
  theme(text = element_text(family = "Century Schoolbook"), legend.title=element_blank()) +
  # female model annotation (coefficients from females_lm above)
  annotate("text", x = 1997, y = 1700000, label = "Females:", family = "Century Schoolbook") +
  annotate("text", x = 1997, y = 1600000, label = "y = -58,955,502 + 30,126x", family = "Century Schoolbook") +
  annotate("text", x = 1997, y = 1500000, label = "R^2 == 0.98", parse = TRUE, family = "Century Schoolbook") +
  # male model annotation (coefficients from males_lm above)
  annotate("text", x = 2002, y = 850000, label = "Males:", family = "Century Schoolbook") +
  annotate("text", x = 2002, y = 775000, label = "y = -17,112,153 + 9,069x", family = "Century Schoolbook") +
  annotate("text", x = 2002, y = 700000, label = "R^2 == 0.85", parse = TRUE, family = "Century Schoolbook")
regression_graphs
# (parse = TRUE resolved the two "is.na() applied to non-(list or vector) of
# type 'expression'" warnings the original expression() labels produced)
female: y = -58,955,502 + 30,126x (R² = 0.98)
male: y = -17,112,153 + 9,069x (R² = 0.85)
2. Shifts in female PhD recipients by field (1985, 2000, and 2015). Describe if and how there was a shift in PhDs awarded to females in four fields (Physical and Earth Sciences, Engineering, Education, and Humanities & Arts) in 1985, 2000, and 2015. Describe your results statistically, in a graph or table, and in text. Note: There are several ways that you can interpret this question. You are invited to decide which you think is/are most interesting. Just be really clear about what you are asking/answering in your report.
# PhDs awarded to females by field (counts) for 1985, 2000, 2015
female_phd <- read_csv("PhDs by Field 1985 - 2015.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## sciences = col_number(),
## engineering = col_number(),
## education = col_number(),
## humanities = col_number()
## )
# Convert to a plain data.frame before assigning row names: setting row names
# on a tibble is deprecated (the original code produced that warning here).
female_phd1 <- as.data.frame(select(female_phd, -year)) # drop year column
rownames(female_phd1) <- c("1985", "2000", "2015") # label rows by year
female_phd_prop <- prop.table(as.matrix(female_phd1), 1) # row-wise proportions (share of each field per year)
female_phd_x2 <- chisq.test(female_phd1) # chi-square test: did field composition change across years?
female_phd_x2 ####### ilayda add total enrollment column #######
##
## Pearson's Chi-squared test
##
## data: female_phd1
## X-squared = 2073.3, df = 6, p-value < 2.2e-16
# females moved out of education, and into the sciences and engineering (engineering especially). The humanities generally stayed the same.
# Ilayda make table adding in the total enrollment of females column
#adding stacked barchart to see if we want to use it
stacked_bar_data <- read_csv("female_phd_stackedbar.csv")
## Parsed with column specification:
## cols(
## year = col_number(),
## field_of_study = col_character(),
## degrees_awarded = col_number()
## )
# treat year as discrete so 1985 / 2000 / 2015 each get their own bar
stacked_bar_data$year <- as.character(stacked_bar_data$year)
# geom_col() is the idiomatic shortcut for geom_bar(stat = "identity")
stacked_bar <- ggplot(stacked_bar_data, aes(x = year, y = degrees_awarded, fill = field_of_study)) +
  geom_col() +
  theme_classic() +
  scale_x_discrete(expand = c(0,0)) +
  scale_y_continuous(expand = c(0,0)) +
  theme(text = element_text(family = "Century Schoolbook")) +
  scale_fill_discrete(name = "Field of Study") +
  labs(x = "Year", y = "Degrees Awarded") +
  # percentage labels = each field's share of that year's total (from female_phd_prop)
  annotate("text", x = 1, y = 270, label = "10.1%", family = "Century Schoolbook") +
  annotate("text", x = 1, y = 1300, label = "24.6%", family = "Century Schoolbook") +
  annotate("text", x = 1, y = 2075, label = "3.5%", family = "Century Schoolbook") +
  annotate("text", x = 1, y = 3700, label = "61.8%", family = "Century Schoolbook") +
  annotate("text", x = 2, y = 500, label = "11.7%", family = "Century Schoolbook") +
  annotate("text", x = 2, y = 2400, label = "30.7%", family = "Century Schoolbook") +
  annotate("text", x = 2, y = 4100, label = "9.6%", family = "Century Schoolbook") +
  annotate("text", x = 2, y = 6500, label = "48.0%", family = "Century Schoolbook") +
  annotate("text", x = 3, y = 900, label = "18.7%", family = "Century Schoolbook") +
  annotate("text", x = 3, y = 3300, label = "26.7%", family = "Century Schoolbook") +
  annotate("text", x = 3, y = 6000, label = "21.7%", family = "Century Schoolbook") +
  annotate("text", x = 3, y = 8800, label = "32.9%", family = "Century Schoolbook")
stacked_bar
3. Male and female salaries for starting postdoctoral and other employment positions (2015). Compare median salaries for male and female doctorate recipients in 2015. Answer these two questions: Does median salary differ significantly between male and female starting postdoc positions? Does median salary differ significantly between male and female PhD recipients in non-postdoc employment positions?
First test for equal variance
wilcoxon signed rank (paired) ??
Cliffs Delta (effect size)
Actual difference in medians – does this apply here? I don’t think taking the median of medians makes sense??
# Median salaries by field for doctoral recipients (2015): p_* = postdoc, np_* = non-postdoc
med_sal <- read_csv("Median salary for doctoral recipients cleaned.csv")
## Parsed with column specification:
## cols(
## field_of_study = col_character(),
## p_sal_m = col_number(),
## p_sal_f = col_number(),
## np_sal_m = col_number(),
## np_sal_f = col_number()
## )
######################## Post Doctoral Work ############################
# Test for equal variance
# H0: Variances are equal
# HA: Variances are not equal
# NOTE(review): the F test assumes normality in both groups; with n = 15 per
# group (df = 14 below) the histograms further down are only a rough check.
var_test_p <- var.test(med_sal$p_sal_m, med_sal$p_sal_f)
var_test_p
##
## F test to compare two variances
##
## data: med_sal$p_sal_m and med_sal$p_sal_f
## F = 0.90255, num df = 14, denom df = 14, p-value = 0.8506
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.3030138 2.6883336
## sample estimates:
## ratio of variances
## 0.9025532
# retain the null hypothesis that the variances are equal
# Exploratory graphs
# hist for male median salary postdoctoral
# Rice rule gives 2 * 15^(1/3) = 4.932424 bins for n = 15; geom_histogram()
# expects a whole number of bins, so round up to 5 instead of passing the
# fractional value directly.
p_male_hist <- ggplot(aes(x = p_sal_m), data = med_sal) +
  geom_histogram(bins = 5)
p_male_hist
# seems relatively normal
p_female_hist <- ggplot(aes(x = p_sal_f), data = med_sal) +
  geom_histogram(bins = 5)
p_female_hist
# seems relatively normal
# H0: There is not a significant difference in ranks for male and female post doc positions
# HA: There is a significant difference in ranks for male and female post doc positions
# Paired Wilcoxon signed-rank: fields are the pairing unit (same field, M vs F)
wsr_salaries_p <- wilcox.test(med_sal$p_sal_m, med_sal$p_sal_f, paired = TRUE)
## Warning in wilcox.test.default(med_sal$p_sal_m, med_sal$p_sal_f, paired =
## TRUE): cannot compute exact p-value with ties
## Warning in wilcox.test.default(med_sal$p_sal_m, med_sal$p_sal_f, paired =
## TRUE): cannot compute exact p-value with zeroes
wsr_salaries_p
##
## Wilcoxon signed rank test with continuity correction
##
## data: med_sal$p_sal_m and med_sal$p_sal_f
## V = 19.5, p-value = 0.8884
## alternative hypothesis: true location shift is not equal to 0
# p = 0.8884 (an earlier draft of this comment said 0.8671, which did not match the output above)
# this tells us to retain null. There is no significant diff between male and female post doc positions among field.
# Cliffs delta
p_sal_delta <- cliff.delta(med_sal$p_sal_m, med_sal$p_sal_f)
p_sal_delta
##
## Cliff's Delta
##
## delta estimate: 0.04 (negligible)
## 95 percent confidence interval:
## inf sup
## -0.3710946 0.4379849
# effect size is 0.04 (negligible)
# Actual difference in medians
# median(med_sal$p_sal_m) - median(med_sal$p_sal_f)
# difference = 3000
# per-field male-minus-female differences for reporting
med_sal_p <- med_sal %>%
select(field_of_study, p_sal_m, p_sal_f) %>%
mutate(difference = p_sal_m - p_sal_f)
########################## non postdoctoral work #########################################
# Test for equal variance
# H0: Variances are equal
# HA: Variances are not equal
var_test_np <- var.test(med_sal$np_sal_m, med_sal$np_sal_f)
var_test_np
##
## F test to compare two variances
##
## data: med_sal$np_sal_m and med_sal$np_sal_f
## F = 1.0856, num df = 14, denom df = 14, p-value = 0.88
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.3644779 3.2336422
## sample estimates:
## ratio of variances
## 1.085629
# retain the null hypothesis that the variances are equal
# Exploratory graphs
# hist for male median salary, non-postdoctoral
# Rice rule gives 2 * 15^(1/3) = 4.932424 bins for n = 15; geom_histogram()
# expects a whole number of bins, so round up to 5 instead of passing the
# fractional value directly.
np_male_hist <- ggplot(aes(x = np_sal_m), data = med_sal) +
  geom_histogram(bins = 5)
np_male_hist
# seems relatively normal
np_female_hist <- ggplot(aes(x = np_sal_f), data = med_sal) +
  geom_histogram(bins = 5)
np_female_hist
# seems relatively normal
# H0: There is not a significant difference in ranks for male and female non post doc positions
# HA: There is a significant difference in ranks for male and female non post doc positions
# Paired Wilcoxon signed-rank: fields are the pairing unit (same field, M vs F)
wsr_salaries_np <- wilcox.test(med_sal$np_sal_m, med_sal$np_sal_f, paired = TRUE)
## Warning in wilcox.test.default(med_sal$np_sal_m, med_sal$np_sal_f, paired =
## TRUE): cannot compute exact p-value with ties
## Warning in wilcox.test.default(med_sal$np_sal_m, med_sal$np_sal_f, paired =
## TRUE): cannot compute exact p-value with zeroes
wsr_salaries_np
##
## Wilcoxon signed rank test with continuity correction
##
## data: med_sal$np_sal_m and med_sal$np_sal_f
## V = 101, p-value = 0.002572
## alternative hypothesis: true location shift is not equal to 0
# p = 0.002572
# this tells us to reject null. There is significant diff between male and female non post doc positions among field of study.
# Cliffs delta
np_sal_delta <- cliff.delta(med_sal$np_sal_m, med_sal$np_sal_f)
np_sal_delta
##
## Cliff's Delta
##
## delta estimate: 0.2133333 (small)
## 95 percent confidence interval:
## inf sup
## -0.2155378 0.5732121
# effect size is 0.213333 (SMALL)
# Actual difference in medians
# median(med_sal$np_sal_m) - median(med_sal$np_sal_f)
# difference = 3417
# per-field male-minus-female differences for reporting
med_sal_np <- med_sal %>%
select(field_of_study, np_sal_m, np_sal_f) %>%
mutate(difference = np_sal_m - np_sal_f)
# read in new med_sal data (long format: one row per field x gender)
med_sal_bar <- read_csv("med_sal_bar.csv")
## Parsed with column specification:
## cols(
## field_of_study = col_character(),
## salary_np = col_number(),
## gender = col_character(),
## salary_p = col_number()
## )
# column graph of differences for non postdoctoral work
# geom_col() already uses stat = "identity"; passing stat explicitly was
# ignored and was the source of the "Ignoring unknown parameters: stat" warning.
np_bar_salaries <- ggplot(med_sal_bar, aes(x = field_of_study, y = salary_np, fill = gender)) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_classic() +
  scale_x_discrete(expand = c(0,0)) +
  scale_y_continuous(expand = c(0,0)) +
  labs(x = "Field of Study", y = "Median Salary ($)") +
  theme(text = element_text(family = "Century Schoolbook"), legend.title=element_blank()) +
  scale_fill_manual( values = c("salmon", "skyblue1"),labels = c("Female", "Male"))
np_bar_salaries
# column graph of differences for postdoctoral work
# geom_col() already uses stat = "identity"; passing stat explicitly was
# ignored and was the source of the "Ignoring unknown parameters: stat" warning.
p_bar_salaries <- ggplot(med_sal_bar, aes(x = field_of_study, y = salary_p, fill = gender)) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_classic() +
  scale_x_discrete(expand = c(0,0)) +
  scale_y_continuous(expand = c(0,0)) +
  labs(x = "Field of Study", y = "Median Salary ($)") +
  scale_fill_manual(values = c("salmon", "skyblue1"),labels = c("Female", "Male")) +
  theme(text = element_text(family = "Century Schoolbook"), legend.title=element_blank())
p_bar_salaries
4. Exploring academic salaries for professors in U.S. colleges. Explore relationships between variables in the ‘Faculty salary data (2008 - 2009 survey)’ dataset. Develop a model describing faculty salary based on data for faculty sex, rank, years in current position, field, and number of years since doctoral degree was earned. You should make decisions regarding which variables should remain in your final model. Describe the results qualitatively and quantitatively (i.e., don’t just report the statistical results of the model – make sure you describe interesting findings in text). You can also discuss any concerns that you have with the model(s) you present, if any.
H0: The model does not significantly predict the outcome (i.e., there is no relationship between the model and the outcome; no variables significantly predict outcome) HA: AT LEAST ONE of the explanatory variables significantly predicts the outcome
How do I know which model is the BEST one?
STATISTICS IS NO SUBSTITUTE FOR JUDGMENT.
ASSUMPTIONS.
AKAIKE INFORMATION CRITERION.
Signs and Symptoms of Collinearity
High correlation between predictor variables
Large standard errors for regression coefficient
Large changes in model output for small changes in predictor variable inputs
Illogical/unexpected coefficients (esp. coefficient sign)
Variance Inflation Factor
How much the variance in coefficients is inflated (compared to the variance if there were no correlation between model variables)
VIF = 1: No correlation (variance is not inflated at all by correlation with other predictor variables)
VIF > 4: Eeehhh…some correlation, investigate further and think critically about possible overlaps in explanatory variables
VIF > 10: Serious correlation between variables that may be strongly inflating coefficient variance and error
# Faculty salary survey: one row per faculty member
faculty_salary <- read_csv("Faculty salary data (2008 - 2009 survey).csv")
## Parsed with column specification:
## cols(
## rank = col_character(),
## field = col_character(),
## years_since = col_integer(),
## years_service = col_integer(),
## sex = col_character(),
## salary = col_integer()
## )
# look at relationships between variables
plot(faculty_salary$years_since, faculty_salary$salary) # looks ok
plot(faculty_salary$years_service, faculty_salary$salary) # looks ok
plot(faculty_salary$years_since, faculty_salary$years_service) # definitely collinear
# a. Set rank to class 'factor'
faculty_salary$rank <- as.factor(faculty_salary$rank)
# b. Reassign reference level of "rank" to "AsstProf" so coefficients compare against assistant professors:
faculty_salary$rank <- fct_relevel(faculty_salary$rank, "AsstProf")
levels(faculty_salary$rank)
## [1] "AsstProf" "AssocProf" "Prof"
# c. Set field to class 'factor'
faculty_salary$field <- as.factor(faculty_salary$field)
# d. Reassign reference level of "field" to "Theoretical" (an earlier draft of this comment referred to "Status"/"Regular" from another script):
faculty_salary$field <- fct_relevel(faculty_salary$field, "Theoretical")
levels(faculty_salary$field)
## [1] "Theoretical" "Applied"
# saturated model: all candidate predictors included
test_lm1 <- lm(salary ~ rank + field + years_since + years_service + sex, data = faculty_salary)
summary(test_lm1)
##
## Call:
## lm(formula = salary ~ rank + field + years_since + years_service +
## sex, data = faculty_salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65248 -13211 -1775 10384 99592
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65955.2 4588.6 14.374 < 2e-16 ***
## rankAssocProf 12907.6 4145.3 3.114 0.00198 **
## rankProf 45066.0 4237.5 10.635 < 2e-16 ***
## fieldApplied 14417.6 2342.9 6.154 1.88e-09 ***
## years_since 535.1 241.0 2.220 0.02698 *
## years_service -489.5 211.9 -2.310 0.02143 *
## sexMale 4783.5 3858.7 1.240 0.21584
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22540 on 390 degrees of freedom
## Multiple R-squared: 0.4547, Adjusted R-squared: 0.4463
## F-statistic: 54.2 on 6 and 390 DF, p-value: < 2.2e-16
plot(test_lm1)
# heteroskedastic
# variance inflation factors: values > 4-ish warrant a closer look
vif(test_lm1)
## GVIF Df GVIF^(1/(2*Df))
## rank 2.013193 2 1.191163
## field 1.064105 1 1.031555
## years_since 7.518936 1 2.742068
## years_service 5.923038 1 2.433729
## sex 1.030805 1 1.015285
# years since = 7.5
# years service = 5.9
# just as we expected, most likely collinear. Should be concerned. Lets try taking out years since phd.
test_lm2 <- lm(salary ~ rank + field + years_service + sex, data = faculty_salary)
summary(test_lm2)
##
## Call:
## lm(formula = salary ~ rank + field + years_service + sex, data = faculty_salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -64202 -14255 -1533 10571 99163
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68351.67 4482.20 15.250 < 2e-16 ***
## rankAssocProf 14560.40 4098.32 3.553 0.000428 ***
## rankProf 49159.64 3834.49 12.820 < 2e-16 ***
## fieldApplied 13473.38 2315.50 5.819 1.24e-08 ***
## years_service -88.78 111.64 -0.795 0.426958
## sexMale 4771.25 3878.00 1.230 0.219311
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22650 on 391 degrees of freedom
## Multiple R-squared: 0.4478, Adjusted R-squared: 0.4407
## F-statistic: 63.41 on 5 and 391 DF, p-value: < 2.2e-16
plot(test_lm2)
#heteroskedastic
vif(test_lm2)
## GVIF Df GVIF^(1/(2*Df))
## rank 1.597441 2 1.124233
## field 1.029040 1 1.014416
## years_service 1.627110 1 1.275582
## sex 1.030803 1 1.015284
# everything is ok here
# alternative: keep years_since instead of years_service
test_lm3 <- lm(salary ~ rank + field + years_since + sex, data = faculty_salary)
summary(test_lm3)
##
## Call:
## lm(formula = salary ~ rank + field + years_since + sex, data = faculty_salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67451 -13860 -1549 10716 97023
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67884.32 4536.89 14.963 < 2e-16 ***
## rankAssocProf 13104.15 4167.31 3.145 0.00179 **
## rankProf 46032.55 4240.12 10.856 < 2e-16 ***
## fieldApplied 13937.47 2346.53 5.940 6.32e-09 ***
## years_since 61.01 127.01 0.480 0.63124
## sexMale 4349.37 3875.39 1.122 0.26242
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22660 on 391 degrees of freedom
## Multiple R-squared: 0.4472, Adjusted R-squared: 0.4401
## F-statistic: 63.27 on 5 and 391 DF, p-value: < 2.2e-16
plot(test_lm3)
##### use this model^^^^^
### check AIC to determine which is better. Still we should trust our judgement here though
# (lower AIC is better; test_lm1 scores lowest but keeps the collinear
# years_since/years_service pair, so judgement favors the simpler models)
AIC(test_lm1)
## [1] 9093.826
#9093.826
AIC(test_lm2)
## [1] 9096.813
#9096.813
AIC(test_lm3)
## [1] 9097.22
#9097.22
# Exploratory plots of salary vs experience, colored by field / rank, to look
# for interactions worth modeling. geom_point() and geom_smooth() inherit the
# aesthetics declared in ggplot(), so restating x/y/color inside geom_smooth()
# (as the original did) was redundant and is dropped here.
ggplot(aes(x = years_service, y = salary, color = field), data = faculty_salary) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE)
ggplot(aes(x = years_since, y = salary, color = field), data = faculty_salary) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  theme_classic() +
  labs(x = "Years Since PHD", y = "Salary", color = "Field") +
  theme(text = element_text(family = "Century Schoolbook"))
ggplot(aes(x = years_service, y = salary, color = rank), data = faculty_salary) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE)
ggplot(aes(x = years_since, y = salary, color = rank), data = faculty_salary) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE)
##interaction term for field and years_since
test_lm4 <- lm(salary ~ rank + field + years_since + sex + years_since*field, data = faculty_salary)
summary(test_lm4)
##
## Call:
## lm(formula = salary ~ rank + field + years_since + sex + years_since *
## field, data = faculty_salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69074 -14118 -1386 10575 95921
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69396.694 5111.577 13.576 < 2e-16 ***
## rankAssocProf 12916.953 4180.553 3.090 0.00215 **
## rankProf 45684.696 4277.531 10.680 < 2e-16 ***
## fieldApplied 11286.218 4739.175 2.381 0.01772 *
## years_since 8.332 151.148 0.055 0.95607
## sexMale 4464.127 3882.387 1.150 0.25091
## fieldApplied:years_since 117.789 182.886 0.644 0.51991
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22680 on 390 degrees of freedom
## Multiple R-squared: 0.4478, Adjusted R-squared: 0.4393
## F-statistic: 52.71 on 6 and 390 DF, p-value: < 2.2e-16
vif(test_lm4)
## GVIF Df GVIF^(1/(2*Df))
## rank 2.019919 2 1.192157
## field 4.299846 1 2.073607
## years_since 2.920851 1 1.709050
## sex 1.030530 1 1.015150
## field:years_since 4.555340 1 2.134324
AIC(test_lm4)
## [1] 9098.798
#9098.798
##interaction term for rank and years_since
test_lm5 <- lm(salary~ rank + field + years_since + sex + years_since*rank, data = faculty_salary)
summary(test_lm5)
##
## Call:
## lm(formula = salary ~ rank + field + years_since + sex + years_since *
## rank, data = faculty_salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68714 -13522 -1724 10716 96337
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68412.9 7392.3 9.255 < 2e-16 ***
## rankAssocProf 17767.8 8254.5 2.152 0.032 *
## rankProf 43639.4 7504.1 5.815 1.26e-08 ***
## fieldApplied 13934.1 2350.2 5.929 6.74e-09 ***
## years_since -11.1 1102.1 -0.010 0.992
## sexMale 4159.9 3886.6 1.070 0.285
## rankAssocProf:years_since -253.4 1139.8 -0.222 0.824
## rankProf:years_since 144.3 1110.0 0.130 0.897
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22680 on 389 degrees of freedom
## Multiple R-squared: 0.4493, Adjusted R-squared: 0.4394
## F-statistic: 45.34 on 7 and 389 DF, p-value: < 2.2e-16
##interaction term for rank and years_service
test_lm6 <- lm(salary~ rank + field + years_service + sex + years_service*rank, data = faculty_salary)
summary(test_lm6)
##
## Call:
## lm(formula = salary ~ rank + field + years_service + sex + years_service *
## rank, data = faculty_salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65007 -14267 -1367 10853 98612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65940.9 6375.2 10.343 < 2e-16 ***
## rankAssocProf 19680.3 6839.7 2.877 0.00423 **
## rankProf 50761.6 6062.2 8.373 1.02e-15 ***
## fieldApplied 13471.4 2318.4 5.811 1.30e-08 ***
## years_service 935.6 1867.1 0.501 0.61660
## sexMale 4748.7 3886.3 1.222 0.22248
## rankAssocProf:years_service -1249.3 1888.4 -0.662 0.50864
## rankProf:years_service -987.9 1871.2 -0.528 0.59783
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22680 on 389 degrees of freedom
## Multiple R-squared: 0.4492, Adjusted R-squared: 0.4393
## F-statistic: 45.33 on 7 and 389 DF, p-value: < 2.2e-16
##interaction term for field and years_service
test_lm7 <- lm(salary~ rank + field + years_service + sex + years_service*field, data = faculty_salary)
summary(test_lm7)
##
## Call:
## lm(formula = salary ~ rank + field + years_service + sex + years_service *
## field, data = faculty_salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70632 -14191 -2098 9937 94331
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72438.8 4805.6 15.074 < 2e-16 ***
## rankAssocProf 13899.7 4086.8 3.401 0.000741 ***
## rankProf 48021.5 3846.7 12.484 < 2e-16 ***
## fieldApplied 6255.7 3916.2 1.597 0.110989
## years_service -266.8 135.8 -1.965 0.050128 .
## sexMale 5196.1 3861.9 1.345 0.179254
## fieldApplied:years_service 406.3 178.3 2.279 0.023221 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22530 on 390 degrees of freedom
## Multiple R-squared: 0.455, Adjusted R-squared: 0.4467
## F-statistic: 54.27 on 6 and 390 DF, p-value: < 2.2e-16
# html regression table for the selected model (test_lm3); rendered below
stargazer(test_lm3, type = "html")
| Dependent variable: | |
| salary | |
| rankAssocProf | 13,104.150*** |
| (4,167.315) | |
| rankProf | 46,032.550*** |
| (4,240.120) | |
| fieldApplied | 13,937.470*** |
| (2,346.534) | |
| years_since | 61.011 |
| (127.010) | |
| sexMale | 4,349.366 |
| (3,875.393) | |
| Constant | 67,884.320*** |
| (4,536.892) | |
| Observations | 397 |
| R2 | 0.447 |
| Adjusted R2 | 0.440 |
| Residual Std. Error | 22,663.240 (df = 391) |
| F Statistic | 63.266*** (df = 5; 391) |
| Note: | p<0.1; p<0.05; p<0.01 |
######################## Predictions from the fitted model ################################
# predict values and graph
# The original rep()-built data frame (flagged "THIS IS NOT RIGHT" by the
# author) had column lengths of 240 and 720 that only lined up through silent
# recycling, so its rows did not form a true factorial design, and it included
# a salary column that predict() does not use. expand.grid() builds every
# rank x field x sex x years_since combination explicitly (240 rows).
df_new <- expand.grid(years_since = seq(from = 1, to = 60, length.out = 20),
                      rank = c("AsstProf", "AssocProf", "Prof"),
                      field = c("Theoretical", "Applied"),
                      sex = c("Female", "Male"))
# a. Make predictions using the new data frame:
# NOTE(review): predictions use test_lm4 although test_lm3 was marked as the
# chosen model above — confirm which model should drive this figure.
salary_predict <- as.data.frame(predict(test_lm4, newdata = df_new, se.fit = TRUE, interval = "confidence"))
# b. Bind predictions to the data to make it actually useful:
predict_df <- data.frame(df_new, salary_predict)
# line graph of predicted salary vs years since PhD, one line per rank, one panel per field
predict_graph <- ggplot(predict_df, aes(x = years_since, y = fit.fit)) +
  geom_line(aes(color = rank)) +
  facet_wrap(~field)
predict_graph